{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "81fc9f42-5e36-4055-9499-d2d45eb0c6d8",
   "metadata": {},
   "source": [
    "<center><h1>A data-driven approach for the assessment of the thermal stratification of reservoirs based on readily available data.</h1> "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "251e1718-548d-4a0c-8158-4d7b3a874330",
   "metadata": {},
   "source": [
    "<center><h4><span style='color:royalblue'> María Castrillo </span>, Fernando Aguilar, Daniel García\n",
    "<center> (castrillo@ifca.unican.es)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "026756b1-2ad0-46b4-826e-3c83af4bf19a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Import libraries and pack\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.dates as mdates\n",
    "import seaborn as sns\n",
    "import scipy\n",
    "import sklearn\n",
    "import mlxtend\n",
    "import tensorflow as tf\n",
    "import shap\n",
    "import scikeras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "174d05fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from scikeras.wrappers import KerasRegressor, KerasClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad8bc430",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow import keras\n",
    "from keras import optimizers\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Activation, LSTM, Dropout, SimpleRNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac9d5237",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from sklearn.model_selection import KFold, cross_val_score, cross_validate\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "from sklearn.metrics import mean_absolute_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85105a1d-ac14-4896-a59c-33fbe4357bd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams[\"font.family\"] = \"serif\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f83fb22c",
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.keras.utils.set_random_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3803857",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Read the data\n",
    "data = pd.read_csv(\"dailyElVal.csv\", sep=\",\")\n",
    "data[\"Date\"] = pd.to_datetime(data[\"Date\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6648c3ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "182a7b9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c012339",
   "metadata": {},
   "source": [
    "## Outliers detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "765913b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "#After visual inspection we eliminate observations with RadMax > 800 and RadAvg < 180, which are obvious outliers in this time series.\n",
    "data = data[data[\"RadMax\"] < 800]\n",
    "data = data[data[\"RadAvg\"] < 180]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b6e25d9-0514-4f87-9fa8-ff6bb5e9a5f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b63567aa-6f6a-4f33-9865-18164ed38ce4",
   "metadata": {},
   "source": [
    "## Violin plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2816d36f",
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\"AirTempMax\", \"AirTempMin\", \"AirTempAvg\", \"Wind\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\", \"Flow\"]\n",
    "\n",
    "dark_gray = \"#000000\"\n",
    "light_gray = \"#eeeeee\"\n",
    "\n",
    "fig, axs = plt.subplots(2, 5, figsize=(12, 6), layout='constrained')\n",
    "\n",
    "for i, col in enumerate(cols): \n",
    "    ax = axs.flat[i]\n",
    "    sns.violinplot(y=data[col], ax=ax,  color=\"lightsalmon\")\n",
    "    ax.spines[\"top\"].set_visible(False)\n",
    "    ax.spines[\"right\"].set_visible(False)\n",
    "    ax.spines[\"left\"].set_color(dark_gray)\n",
    "    ax.spines[\"bottom\"].set_color(dark_gray)\n",
    "    ax.set_facecolor('xkcd:white')\n",
    "    \n",
    "    ax.get_yaxis().tick_left()\n",
    "    ax.get_xaxis().tick_bottom()\n",
    "    \n",
    "    ax.tick_params(axis=\"both\", which=\"major\", labelsize=12, colors=dark_gray)\n",
    "    ax.tick_params(axis=\"both\", which=\"minor\", labelsize=10, colors=dark_gray)\n",
    "    ax.xaxis.label.set_color(dark_gray)\n",
    "    ax.set_ylabel(cols[i], fontsize=14)\n",
    "    \n",
    "fig.delaxes(ax)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "acd2ea09",
   "metadata": {},
   "source": [
    "## Data shifts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7089735",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[[\"AirTempAvg1\", \"Wind1\", \"AirTempMin1\", \"AirTempMax1\", \"RadMax1\", \"RadAvg1\", \"Rain1\", \"Level1\", \"Flow1\"]]=data[[\"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\"]].shift(1, axis = 0)\n",
    "data[[\"AirTempAvg2\", \"Wind2\", \"AirTempMin2\", \"AirTempMax2\", \"RadMax2\", \"RadAvg2\", \"Rain2\", \"Level2\", \"Flow2\"]]=data[[\"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\"]].shift(2, axis = 0)\n",
    "data[[\"AirTempAvg3\", \"Wind3\", \"AirTempMin3\", \"AirTempMax3\", \"RadMax3\", \"RadAvg3\", \"Rain3\", \"Level3\", \"Flow3\"]]=data[[\"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\"]].shift(3, axis = 0)\n",
    "data[[\"AirTempAvg4\", \"Wind4\", \"AirTempMin4\", \"AirTempMax4\", \"RadMax4\", \"RadAvg4\", \"Rain4\", \"Level4\", \"Flow4\"]]=data[[\"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\"]].shift(4, axis = 0)\n",
    "data[[\"AirTempAvg5\", \"Wind5\", \"AirTempMin5\", \"AirTempMax5\", \"RadMax5\", \"RadAvg5\", \"Rain5\", \"Level5\", \"Flow5\"]]=data[[\"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\"]].shift(5, axis = 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fcf38094-bc55-44e1-9d3b-9048294ffa86",
   "metadata": {},
   "source": [
    "# Modelling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4f8d223",
   "metadata": {},
   "source": [
    "## Unsupervised detection of the thermocline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aaf7559b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#data[\"Date\"]  = pd.to_datetime(data[\"Date\"], format = \"%Y-%M-%d\").dt.normalize()\n",
    "#Separate train and test fractions\n",
    "data2 = data.drop(columns = [\"Thermocline_depth\", \"Thermocline_temp\"])\n",
    "train = data2[data2[\"Date\"] <= \"2021-12-31\"].dropna()\n",
    "test = data2[data2[\"Date\"] >= \"2022-01-01\"].dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "875dca74",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define inputs and target variable\n",
    "X = train[[\"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\"]]\n",
    "y = train[\"Thermocline\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07f5c58b-c169-4a21-b0b5-50b966d62211",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Scale the input\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bb20054",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define and train the model\n",
    "kmeans = KMeans(n_clusters= 2, \n",
    "            random_state=42)\n",
    "\n",
    "kmeans.fit(X_scaled)\n",
    "\n",
    "#Predict the clusters\n",
    "kmeans_pred = kmeans.predict(X_scaled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11d332c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans_pred = pd.DataFrame(kmeans_pred).set_index(y.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af85e6f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Obtain clusters centers\n",
    "kmeans.cluster_centers_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3eb575c",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = train.merge(kmeans_pred, left_index=True, right_index=True)\n",
    "train = train.rename(columns = {'0_x':\"kmeans\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7045f760",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(12,6))\n",
    "\n",
    "plt.plot(train.Date, (train.Gradient), linewidth=0.5, color='black')\n",
    "plt.plot(train.Date[train.kmeans==0], (train.Gradient[train.kmeans==0]), \n",
    "         linestyle=\" \", marker=\"D\", markersize=3, color='coral', label=\"Cluster 0\")\n",
    "plt.plot(train.Date[train.kmeans==1], (train.Gradient[train.kmeans==1]), \n",
    "         linestyle=\" \", marker=\"o\", markersize=4,  color='slateblue', label=\"Cluster 1\")\n",
    "\n",
    "plt.xlabel('Time', fontsize = 14)\n",
    "plt.ylabel('Thermal gradient 1 m around Z', fontsize = 14)\n",
    "plt.xticks(())\n",
    "plt.yticks(np.arange(0,15,1))\n",
    "plt.hlines(y=0.3, xmin=17500, xmax=19000, colors='red', linestyles='-', lw=1, label=\"0.3 \\u2103 $\\mathregular{m^{-1}}$\")\n",
    "plt.legend()\n",
    "\n",
    "dark_gray = \"#000000\"\n",
    "light_gray = \"#eeeeee\"\n",
    "\n",
    "ax = fig.axes[0]\n",
    "ax.spines[\"top\"].set_visible(False)\n",
    "ax.spines[\"right\"].set_visible(False)\n",
    "ax.spines[\"left\"].set_color(dark_gray)\n",
    "ax.spines[\"bottom\"].set_color(dark_gray)\n",
    "\n",
    "ax.yaxis.grid(True, color=light_gray)\n",
    "ax.xaxis.grid(True, color=light_gray)\n",
    "\n",
    "import matplotlib.dates as mdates\n",
    "ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=(1, 7)))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb238c41",
   "metadata": {},
   "source": [
    "## Classification neural network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fd32fb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = train[[\n",
    "                 \"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\",\n",
    "                 \"AirTempAvg1\", \"Wind1\", \"AirTempMin1\", \"AirTempMax1\", \"RadMax1\", \"RadAvg1\", \"Rain1\", \"Level1\", \"Flow1\",\n",
    "                 \"AirTempAvg2\", \"Wind2\", \"AirTempMin2\", \"AirTempMax2\", \"RadMax2\", \"RadAvg2\", \"Rain2\", \"Level2\", \"Flow2\",\n",
    "                 \"AirTempAvg3\", \"Wind3\", \"AirTempMin3\", \"AirTempMax3\", \"RadMax3\", \"RadAvg3\", \"Rain3\", \"Level3\", \"Flow3\",\n",
    "                 \"AirTempAvg4\", \"Wind4\", \"AirTempMin4\", \"AirTempMax4\", \"RadMax4\", \"RadAvg4\", \"Rain4\", \"Level4\", \"Flow4\",\n",
    "                 \"AirTempAvg5\", \"Wind5\", \"AirTempMin5\", \"AirTempMax5\", \"RadMax5\", \"RadAvg5\", \"Rain5\", \"Level5\", \"Flow5\",\n",
    "                ]]\n",
    "y_train = train[\"Thermocline\"]\n",
    "X_test = test[[        \n",
    "                 \"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\",  \"Level\", \"Flow\",\n",
    "                 \"AirTempAvg1\", \"Wind1\", \"AirTempMin1\", \"AirTempMax1\", \"RadMax1\", \"RadAvg1\", \"Rain1\", \"Level1\", \"Flow1\", \n",
    "                 \"AirTempAvg2\", \"Wind2\", \"AirTempMin2\", \"AirTempMax2\", \"RadMax2\", \"RadAvg2\", \"Rain2\", \"Level2\", \"Flow2\",\n",
    "                 \"AirTempAvg3\", \"Wind3\", \"AirTempMin3\", \"AirTempMax3\", \"RadMax3\", \"RadAvg3\", \"Rain3\", \"Level3\", \"Flow3\",\n",
    "                 \"AirTempAvg4\", \"Wind4\", \"AirTempMin4\", \"AirTempMax4\", \"RadMax4\", \"RadAvg4\", \"Rain4\", \"Level4\", \"Flow4\",\n",
    "                 \"AirTempAvg5\", \"Wind5\", \"AirTempMin5\", \"AirTempMax5\", \"RadMax5\", \"RadAvg5\", \"Rain5\", \"Level5\", \"Flow5\",\n",
    "              ]]\n",
    "y_test = test[\"Thermocline\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "439e04d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Scaling\n",
    "scaler = StandardScaler()\n",
    "\n",
    "X_train[X_train.columns] = scaler.fit_transform(X_train)\n",
    "X_test[X_test.columns] = scaler.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5464402",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define the architecture of the NN\n",
    "NN_class = Sequential([\n",
    "    Dense(32, input_shape=(len(X_train.columns),)),\n",
    "    Activation('relu'),\n",
    "     Dropout(0.2),\n",
    "    Dense(16),\n",
    "    Activation(\"relu\"),\n",
    "    Dropout(0.2),\n",
    "    Dense(1), \n",
    "    Activation('sigmoid'),\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28545de0",
   "metadata": {},
   "source": [
    "### Cassification neural network with cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58e078a3-9769-4daf-a036-7b4448519395",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Compile the NN\n",
    "metric = tf.keras.metrics.BinaryAccuracy(\n",
    "    name=\"binary_accuracy\", dtype=None, threshold=0.5\n",
    ")\n",
    "\n",
    "opt = keras.optimizers.Adam(learning_rate=0.0001)\n",
    "\n",
    "NN_class.compile(loss = \"binary_crossentropy\",\n",
    "                optimizer = opt, \n",
    "                metrics=metric)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "290da104",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = KerasClassifier(NN_class, batch_size=32, verbose=1, epochs=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "495932c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "scoring = {'acc': 'accuracy',\n",
    "           'recall': 'recall',\n",
    "           'auc': 'roc_auc'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93475533",
   "metadata": {},
   "outputs": [],
   "source": [
    "kfold = KFold(n_splits=4, shuffle=False)\n",
    "results = cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "605fe779",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Cross-validation results\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4070f252",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Averaged results\n",
    "print(\"Accuracy: \" + str(np.mean(results[\"test_acc\"])))\n",
    "print(\"Recall: \" + str(np.mean(results[\"test_recall\"])))\n",
    "print(\"AUC: \" + str(np.mean(results[\"test_auc\"])))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed2f89c8",
   "metadata": {},
   "source": [
    "### Cassification neural network without cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5699b65-5692-4a76-a30a-1ea62c6a48d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Compile the NN\n",
    "metric = tf.keras.metrics.BinaryAccuracy(\n",
    "    name=\"binary_accuracy\", dtype=None, threshold=0.5\n",
    ")\n",
    "\n",
    "opt = keras.optimizers.Adam(learning_rate=0.0001)\n",
    "\n",
    "NN_class.compile(loss = \"binary_crossentropy\",\n",
    "                optimizer = opt, \n",
    "                metrics=metric)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc2c93e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Train the NN with validation split\n",
    "log_class = NN_class.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdb96b7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Plot the train accuracy and validation accuracy\n",
    "plt.plot(log_class.history['binary_accuracy'])\n",
    "plt.plot(log_class.history['val_binary_accuracy'])\n",
    "plt.legend(['accuracy','validation accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "150b807c",
   "metadata": {},
   "outputs": [],
   "source": [
    "NN_class.fit(X_train, y_train, epochs=200, batch_size=32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40187ab0",
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = NN_class.evaluate(X_train, y_train)\n",
    "print(\"%s: %.2f%%\" % (NN_class.metrics_names[1], scores[1]*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88ed68e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = NN_class.evaluate(X_test, y_test)\n",
    "print(\"%s: %.2f%%\" % (NN_class.metrics_names[1], scores[1]*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93473067",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_train = NN_class.predict(X_train)\n",
    "y_pred = NN_class.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "616a3d27",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8,6))\n",
    "cm_train = confusion_matrix(y_train, y_pred_train.round())  #target and predictions\n",
    "display = ConfusionMatrixDisplay(cm_train)\n",
    "ax.set(title='Train Confusion Matrix')\n",
    "display.plot(ax=ax)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8,6))\n",
    "cm_test = confusion_matrix(y_test, y_pred.round())\n",
    "display = ConfusionMatrixDisplay(cm_test)\n",
    "ax.set(title='Test Confusion Matrix')\n",
    "display.plot(ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6236b5fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import RocCurveDisplay\n",
    "\n",
    "RocCurveDisplay.from_predictions(y_train, y_pred_train)\n",
    "RocCurveDisplay.from_predictions(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dce5bb0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import recall_score\n",
    "\n",
    "recall_train = recall_score(y_train, y_pred_train.round())\n",
    "recall_test = recall_score(y_test, y_pred.round())\n",
    "\n",
    "print(recall_train)\n",
    "print(recall_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99214dc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_train = pd.Series(np.array(y_pred_train.round(0).reshape(-1-1)))\n",
    "y_pred_train.index = y_train.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68de9ddc",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = pd.Series(np.array(y_pred.round(0).reshape(-1-1)))\n",
    "y_pred.index = y_test.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b2eb426",
   "metadata": {},
   "outputs": [],
   "source": [
    "yy_train = pd.concat((data[\"Date\"], y_train), axis=1).set_index(\"Date\", drop=True)\n",
    "yy_pred_train = pd.concat((data[\"Date\"], y_pred_train), axis=1).set_index(\"Date\", drop=True)\n",
    "yy_test = pd.concat((data[\"Date\"], y_test), axis=1).set_index(\"Date\", drop=True)\n",
    "yy_pred = pd.concat((data[\"Date\"], y_pred), axis=1).set_index(\"Date\", drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b72875d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(12,3))\n",
    "\n",
    "plt.plot(yy_train+0.05, linewidth=0.0, marker = \"o\", markersize = 7, color=\"darkslateblue\")\n",
    "plt.plot(yy_test+0.05, linewidth=0.0, marker = \"o\", markersize = 7,label=\"Real data\", color=\"darkslateblue\")\n",
    "plt.plot(yy_pred_train-0.05, linewidth=0.0, marker = \"o\", markersize = 7,  color=\"lightsalmon\")\n",
    "plt.plot(yy_pred-0.05, linewidth=0.0, marker = \"o\", markersize = 7, label=\"Predicted data\", color=\"lightsalmon\")\n",
    "\n",
    "plt.axvline(x = 18994, color = 'black', ls='--', lw=1)\n",
    "\n",
    "plt.xlabel(\"Date\", fontsize = 14)\n",
    "plt.ylabel(\"Thermocline presence\", fontsize = 14)\n",
    "plt.legend(loc=\"center left\", facecolor=\"white\")\n",
    "\n",
    "dark_gray = \"#000000\"\n",
    "light_gray = \"#eeeeee\"\n",
    "\n",
    "ax = fig.axes[0]\n",
    "ax.spines[\"top\"].set_visible(False)\n",
    "ax.spines[\"right\"].set_visible(False)\n",
    "ax.spines[\"left\"].set_color(dark_gray)\n",
    "ax.spines[\"bottom\"].set_color(dark_gray)\n",
    "\n",
    "ax.set_facecolor('xkcd:white')\n",
    "\n",
    "ax.get_yaxis().tick_left()\n",
    "ax.get_xaxis().tick_bottom()\n",
    "\n",
    "ax.tick_params(axis=\"both\", which=\"major\", labelsize=10, colors=dark_gray)\n",
    "ax.tick_params(axis=\"both\", which=\"minor\", labelsize=8, colors=dark_gray)\n",
    "ax.xaxis.label.set_color(dark_gray)\n",
    "ax.yaxis.label.set_color(dark_gray)\n",
    "\n",
    "ax.set_yticks((0,1))\n",
    "\n",
    "ax.yaxis.grid(True, color=light_gray, lw=20)\n",
    "ax.xaxis.grid(True, color=light_gray)\n",
    "\n",
    "ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=(1, 7)))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a61fe3b7",
   "metadata": {},
   "source": [
    "## Regression neural network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20b30d20",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = data[data[\"Date\"] <= \"2021-12-31\"].dropna()\n",
    "test = data[data[\"Date\"] >= \"2022-01-01\"].dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efebd3f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = train[[\n",
    "                \"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\", \"Level\", \"Flow\",\n",
    "                \"AirTempAvg1\", \"Wind1\", \"AirTempMin1\", \"AirTempMax1\", \"RadMax1\", \"RadAvg1\", \"Rain1\", \"Level1\", \"Flow1\",\n",
    "                \"AirTempAvg2\", \"Wind2\", \"AirTempMin2\", \"AirTempMax2\", \"RadMax2\", \"RadAvg2\", \"Rain2\", \"Level2\", \"Flow2\",\n",
    "                \"AirTempAvg3\", \"Wind3\", \"AirTempMin3\", \"AirTempMax3\", \"RadMax3\", \"RadAvg3\", \"Rain3\", \"Level3\", \"Flow3\",\n",
    "                \"AirTempAvg4\", \"Wind4\", \"AirTempMin4\", \"AirTempMax4\", \"RadMax4\", \"RadAvg4\", \"Rain4\", \"Level4\", \"Flow4\",\n",
    "                \"AirTempAvg5\", \"Wind5\", \"AirTempMin5\", \"AirTempMax5\", \"RadMax5\", \"RadAvg5\", \"Rain5\", \"Level5\", \"Flow5\",\n",
    "                ]]\n",
    "y_train = train[\"Thermocline_depth\"]\n",
    "X_test = test[[             \n",
    "                \"AirTempAvg\", \"Wind\", \"AirTempMin\", \"AirTempMax\", \"RadMax\", \"RadAvg\", \"Rain\",  \"Level\", \"Flow\",\n",
    "                \"AirTempAvg1\", \"Wind1\", \"AirTempMin1\", \"AirTempMax1\", \"RadMax1\", \"RadAvg1\", \"Rain1\", \"Level1\", \"Flow1\", \n",
    "                \"AirTempAvg2\", \"Wind2\", \"AirTempMin2\", \"AirTempMax2\", \"RadMax2\", \"RadAvg2\", \"Rain2\", \"Level2\", \"Flow2\",\n",
    "                \"AirTempAvg3\", \"Wind3\", \"AirTempMin3\", \"AirTempMax3\", \"RadMax3\", \"RadAvg3\", \"Rain3\", \"Level3\", \"Flow3\",\n",
    "                \"AirTempAvg4\", \"Wind4\", \"AirTempMin4\", \"AirTempMax4\", \"RadMax4\", \"RadAvg4\", \"Rain4\", \"Level4\", \"Flow4\",\n",
    "                \"AirTempAvg5\", \"Wind5\", \"AirTempMin5\", \"AirTempMax5\", \"RadMax5\", \"RadAvg5\", \"Rain5\", \"Level5\", \"Flow5\",\n",
    "              ]]\n",
    "y_test = test[\"Thermocline_depth\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef5a3f97",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scaling\n",
    "scaler = StandardScaler()\n",
    "\n",
    "X_train[X_train.columns] = scaler.fit_transform(X_train)\n",
    "X_test[X_test.columns] = scaler.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1aa0fec3",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define the architecture of the NN\n",
    "NN_regr = Sequential([\n",
    "    Dense(32, input_shape=(len(X_train.columns),)),\n",
    "    Activation('relu'),\n",
    "    Dropout(0.4),\n",
    "    Dense(16),\n",
    "    Activation('relu'),\n",
    "    Dropout(0.2),\n",
    "    Dense(8),\n",
    "    Activation('relu'),\n",
    "    Dense(1), \n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db7f9519",
   "metadata": {},
   "source": [
    "### Regression neural network with cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e44d315b-a7ee-4717-98d8-fe5eb5598892",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Compile\n",
    "opt = keras.optimizers.Adam(learning_rate=0.001)\n",
    "\n",
    "NN_regr.compile(loss= \"mae\",\n",
    "                optimizer=opt, \n",
    "                metrics= \"mae\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9eb0a15e",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = KerasRegressor(NN_regr, batch_size=32, verbose=1, epochs=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c75116d",
   "metadata": {},
   "outputs": [],
   "source": [
    "kfold = KFold(n_splits=4, shuffle=False)\n",
    "results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e85be23",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Cross-validation results\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19590c43",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Averaged results\n",
    "np.mean(abs(results))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55e96300",
   "metadata": {},
   "source": [
    "### Regression neural network without cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "970cb1de-c48f-4b61-bb2d-5258e57cd68a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Compile\n",
    "opt = keras.optimizers.Adam(learning_rate=0.001)\n",
    "\n",
    "NN_regr.compile(loss= \"mae\",\n",
    "                optimizer=opt, \n",
    "                metrics= \"mae\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7ebc0c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Train the NN with validation split\n",
    "log_rgr = NN_regr.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8b0b7ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Plot the train accuracy and validation accuracy\n",
    "plt.plot(log_rgr.history['mae'])\n",
    "plt.plot(log_rgr.history['val_mae'])\n",
    "plt.legend(['mae','validation mae'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fda11282",
   "metadata": {},
   "outputs": [],
   "source": [
    "NN_regr.fit(X_train, y_train, epochs=500, batch_size=32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6471a3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = NN_regr.evaluate(X_train, y_train)\n",
    "print(\"%s: %.2f\" % (NN_regr.metrics_names[1], scores[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a80f253",
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = NN_regr.evaluate(X_test, y_test)\n",
    "print(\"%s: %.2f\" % (NN_regr.metrics_names[1], scores[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72dff87f",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_train = NN_regr.predict(X_train)\n",
    "y_pred = NN_regr.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1f43660",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = pd.Series(np.array(y_pred.reshape(-1-1)))\n",
    "y_pred.index = y_test.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80f833f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_train = pd.Series(np.array(y_pred_train.reshape(-1-1)))\n",
    "y_pred_train.index = y_train.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f46bc785",
   "metadata": {},
   "outputs": [],
   "source": [
    "MAE_train = np.mean(np.abs((y_train-y_pred_train)))\n",
    "MAE_test = np.mean(np.abs((y_test-y_pred)))\n",
    "P50_train = np.quantile(np.abs(y_train-y_pred_train), 0.50)\n",
    "P50_test = np.quantile(np.abs(y_test-y_pred), 0.50)\n",
    "print(\"P50_train: \" + str(P50_train))\n",
    "print(\"P50_test: \" + str(P50_test))\n",
    "print(\"MAE_train percentile: \" + str((np.sum(np.abs(y_train-y_pred_train) < MAE_train) / len(y_train-y_pred_train) * 100).round(2)))\n",
    "print(\"MAE_test percentile: \" + str((np.sum(np.abs(y_test-y_pred) < MAE_test) / len(y_test-y_pred) * 100).round(2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b746f57",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"MAE_test percentile: \" + str((np.sum(np.abs(y_test-y_pred) < MAE_test) / len(y_test-y_pred) * 100).round(2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2efcfe70",
   "metadata": {},
   "outputs": [],
   "source": [
    "dates_train = data[\"Date\"].iloc[y_train.index]\n",
    "dates_test = data[\"Date\"].iloc[y_test.index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85a5f2a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_high_error = y_pred[abs(y_pred-y_test) > scores[1]]\n",
    "y_pred_low_error = y_pred[abs(y_pred-y_test) <= scores[1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b613dfb",
   "metadata": {},
   "outputs": [],
   "source": [
    "yy_train = pd.concat((data[\"Date\"], y_train), axis=1).set_index(\"Date\", drop=True)\n",
    "yy_pred_train = pd.concat((data[\"Date\"], y_pred_train), axis=1).set_index(\"Date\", drop=True)\n",
    "yy_test = pd.concat((data[\"Date\"], y_test), axis=1).set_index(\"Date\", drop=True)\n",
    "yy_pred_high_error = pd.concat((data[\"Date\"], y_pred_high_error), axis=1).set_index(\"Date\", drop=True)\n",
    "yy_pred_low_error = pd.concat((data[\"Date\"], y_pred_low_error), axis=1).set_index(\"Date\", drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee53fa60",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(12,6))\n",
    "\n",
    "plt.plot(data[\"Thermocline_depth\"], linewidth=0, marker = \"o\", markersize = 5, label=\"Real data\", color=\"darkslateblue\")\n",
    "plt.plot(y_pred_train, linewidth=0, marker = \"^\", markersize = 6, label=\"Predicted data (train)\", color=\"#f57d6b\", markeredgecolor=\"darksalmon\")\n",
    "plt.plot(y_pred_high_error, linewidth=0, marker = \"^\", markersize = 6, label=\"Predicted data (test-high)\", color=\"#ffb14e\", markeredgecolor=\"orange\")\n",
    "plt.plot(y_pred_low_error, linewidth=0, marker = \"^\", markersize = 6, label=\"Predicted data (test-low)\", color=\"#ffff52\", markeredgecolor=\"gold\")\n",
    "\n",
    "plt.xlabel(\"Date\", fontsize = 14)\n",
    "plt.ylabel(\"Z (m)\", fontsize = 14)\n",
    "plt.legend(loc=\"upper left\", facecolor=\"white\")\n",
    "\n",
    "dark_gray = \"#000000\"\n",
    "light_gray = \"#eeeeee\"\n",
    "\n",
    "ax = fig.axes[0]\n",
    "ax.spines[\"top\"].set_visible(False)\n",
    "ax.spines[\"right\"].set_visible(False)\n",
    "ax.spines[\"left\"].set_color(dark_gray)\n",
    "ax.spines[\"bottom\"].set_color(dark_gray)\n",
    "\n",
    "ax.set_facecolor('xkcd:white')\n",
    "\n",
    "ax.get_yaxis().tick_left()\n",
    "ax.get_xaxis().tick_bottom()\n",
    "\n",
    "ax.tick_params(axis=\"both\", which=\"major\", labelsize=10, colors=dark_gray)\n",
    "ax.tick_params(axis=\"both\", which=\"minor\", labelsize=8, colors=dark_gray)\n",
    "ax.xaxis.label.set_color(dark_gray)\n",
    "ax.yaxis.label.set_color(dark_gray)\n",
    "\n",
    "ax.yaxis.grid(True, color=light_gray)\n",
    "ax.xaxis.grid(True, color=light_gray)\n",
    "\n",
    "ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=(1, 7)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f17825c7-cfa9-4460-ada6-4c54b7a4cf8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(12,6))\n",
    "fig, ax1 = plt.subplots()\n",
    "\n",
    "ax2 = ax1.twinx()\n",
    "ax1.plot(data[\"Thermocline_depth\"], linewidth=0, marker = \"o\", markersize = 3, label=\"Real data (train)\", color=\"darkslateblue\")\n",
    "ax2.plot(data[\"Level\"], linewidth=0, marker = \"o\", markersize = 3, label=\"Level\", color=\"red\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9eb32435",
   "metadata": {},
   "source": [
    "## SHAP values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3af6906",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fits the explainer\n",
    "explainer = shap.Explainer(NN_regr.predict, X_train)\n",
    "\n",
    "# Calculates the SHAP values \n",
    "shap_values = explainer(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fdfb5ff6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarize the effects of features\n",
    "shap.plots.beeswarm(shap_values, color=plt.get_cmap(\"plasma\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1a7114e",
   "metadata": {},
   "outputs": [],
   "source": [
    "shap.plots.bar(shap_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "168c3105",
   "metadata": {},
   "outputs": [],
   "source": [
    "abs(shap_values.values).mean(axis=0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
